This code was used for the analysis of Cariaco Basin metagenomes
and metatranscriptomes for the 2022 paper by Geller-McGrath et al. Some
sections of this analysis were done on a compute node of the Poseidon
High Performance Computing (HPC) cluster based at the Woods Hole
Oceanographic Institute (WHOI). All of the analyses done here using R
can be run locally; HPC processing was used to speed up
computations.
Load required libraries, create matrix of counts. Each column will represent a metagenomic sample and rows will correspond to metagenome-assembled genomes (MAGs). Each cell will contain the number of reads that mapped to the contigs of a MAG.
library(tidyverse) # loads several useful data-wrangling and plotting libraries
library(furrr) # library of purrr-style iterators that utilize multiple cores for parallel processing
library(DESeq2) # library for running differential abundance analysis
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(cowplot) # for combining various plots together in grids
library(BiocParallel) # for DESeq2 parallel processing
library(ggtext) # used for ggplots
library(glue) # used for ggplots
plan(multicore, workers = 35) # 35 cores allotted to furrr's multicore iterators
options(future.globals.maxSize = 5242880000) #5000 * 1024^2; 5Gb memory allotted to each core
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/coverM/final_counts_of_reads_mapped_to_genomes/')
files = system('ls *.tsv', intern = TRUE)
sample_names = str_replace(files, '_counts_reads_mapped_per_genome.tsv', '')
# Create a list of read-mapping results (in tibble format); mapped reads to the MAGs (all contigs)
# Note: furrr's iterators are implemented like purrr's with the added 'future_' prepended to the function name; it will call the iterations in parallel across multiple cores
mapped_read_counts = future_map2(
files,
sample_names, ~
vroom::vroom(
file = .x,
delim = '\t',
col_names = c('mag_name', .y),
skip = 1,
show_col_types = FALSE
)
)
# Join all read-mapping results together into a tibble by the column of MAG names in each list component
mapped_read_counts =
reduce(mapped_read_counts, left_join, by = 'mag_name')
Load additional data related to the differential abundance
analysis, prep the DESeq2 object and run DESeq2
# Load a tibble with taxonomic and assembly information for each MAG
mag_table <- readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/coverM/deseq2_dnaseq_rdata/draftMag-taxonomic-table-cariaco.RDA')
# Create metadata tibble for the metagenomic samples
sampleNames_key <- tibble(
sample_name = colnames(mapped_read_counts)[2:ncol(mapped_read_counts)]) %>%
mutate(depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
season = case_when(
(depth == 103 | depth == 198 | depth == 234 | depth == 295 | depth == 314) ~ 'M',
(depth == 143 | depth == 200 | depth == 237 | depth == 247 | depth == 267) ~ 'N',
str_detect(sample_name, 'M') ~ 'M',
str_detect(sample_name, 'N') ~ 'N'),
fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
new_column_names = paste0(season, fraction, depth, '_', replicate)) %>%
arrange(factor(sample_name, levels = colnames(select(mapped_read_counts, -mag_name))))
all(sampleNames_key$sample_name == colnames(select(mapped_read_counts, -mag_name))) #TRUE
# Create sample metadata for use in the DESeq() function. This analysis will look for differential abundance patterns between fractions in three different chemically distinct water layers: the oxycline, shallow anoxic, and euxinic depths
coldata <- sampleNames_key %>%
select(depth, fraction, new_column_names) %>%
mutate(layer = case_when(depth == 900 ~ 'euxinic',
depth >= 103 & depth <= 237 ~ 'oxycline',
depth > 237 & depth < 900 ~ 'shallow.anoxic')) %>%
select(-depth) %>%
dplyr::rename(sample = new_column_names) %>%
unite(group, c(fraction, layer), sep = '_') %>%
relocate(sample) %>%
mutate(group = as_factor(group))
# Rename the mapped-read counts tibble with metagenomic sample naming scheme used in the paper
mapped_read_counts = mapped_read_counts %>%
rename_with(
~ sampleNames_key$new_column_names,
.cols = 2:last_col()) %>%
column_to_rownames(var = 'mag_name')
# Create the DESeq2 analysis object
da_dds <- DESeq2::DESeqDataSetFromMatrix(countData = mapped_read_counts,
colData = coldata,
design = ~ group)
# Run the DESeq2 statistical model on the data; assumes the data have a negative binomial distribution
da_final_dds <- DESeq(da_dds)
Extract the results from the oxycline, shallow anoxic, and
euxinic depths that compare PA and FL fraction types
# pull Wald test results comparing the oxycline PA to FL fractions
oxycline <- results(da_final_dds, contrast = c('group',
'PA_oxycline', 'FL_oxycline'), alpha = 0.05)
# pull Wald test results comparing the euxinic PA to FL fractions
euxinic <- results(da_final_dds, contrast = c('group',
'PA_euxinic', 'FL_euxinic'), alpha = 0.05)
# pull Wald test results comparing the shallow anoxic PA to FL fractions
shallow_anoxic <- results(da_final_dds, contrast = c('group', 'PA_shallow.anoxic',
'FL_shallow.anoxic'), alpha = 0.05)
# create a comprehensive tibble including all differential abundance results
# classify MAG fraction "preference" based on the direction of log2FoldChange and the adjusted p-values
allDaResults <- list(oxycline = oxycline,
shallow_anoxic = shallow_anoxic,
euxinic = euxinic) %>%
imap(~ .x %>%
as.data.frame() %>%
rownames_to_column(var = 'mag_name') %>%
mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
as_tibble() %>%
mutate(preference = case_when(log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
padj > 0.05 | is.na(padj) ~ 'no significant difference'),
layer = .y) %>%
left_join(mag_table %>% select(mag_name, phylum), by = 'mag_name') %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
preference = factor(preference,
levels = c(
'particle enriched',
'free-living enriched',
'no significant difference'))))
# save the differential abundance final results tibble
saveRDS(allDaResults, file = '/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/coverM/deseq2_dnaseq_rdata/deseq2_results_by_water_layer.rds')
Plot the differential abundance results
library(tidyverse) # loads several useful data-wrangling and plotting libraries
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.13.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
##
##
## Attaching package: 'ComplexHeatmap'
##
## The following object is masked from 'package:gridtext':
##
## textbox_grob
library(pheatmap) # for heatmap plots
##
## Attaching package: 'pheatmap'
##
## The following object is masked from 'package:ComplexHeatmap':
##
## pheatmap
library(cowplot) # for combining various plots together in grids
library(ggtext) # used for ggplots
library(glue) # used for ggplots
#locally load DESeq2 results
allDaResults = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/deseq2_results_by_water_layer.rds')
## plotting
magFractionPref <- allDaResults %>%
map_dfr(~
.x %>%
mutate(layer = case_when(
layer == 'oxycline' ~ 'Oxycline',
layer == 'euxinic' ~ 'Euxinic',
layer == 'shallow_anoxic' ~ 'Shallow anoxic',
))
)
magFractionPref_plot = magFractionPref %>%
ggplot(aes(y = fct_rev(fct_infreq(phylum)),
fill = fct_relevel(preference, c('no significant difference',
'free-living enriched',
'particle enriched')))) +
geom_bar() +
labs(x = '\nNumber of genomes', y = 'Phylum\n', fill = 'Fraction preference') +
theme_bw() +
scale_fill_manual(values = c('grey80', 'palegreen3', 'burlywood2')) +
facet_wrap(~ fct_relevel(layer, c('Oxycline', 'Shallow anoxic', 'Euxinic'))) +
theme(
axis.title = element_text(size = 10),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = 'wheat1'),
panel.grid = element_blank(),
legend.position = 'top',
legend.justification = c(0.92,0)
)
magFractionPref_plot
## Processing of RNA-Seq read-mapping .paf files from
Minimap2
Process the .paf output files, filter poor alignments, save the results
# load required libraries
library(tidyverse)
library(furrr)
#set parallel processing params
plan(multicore, workers = 48)
options(future.globals.maxSize = 52428800000)
# metadata file of gene names
gene_metadata = tibble(
gene_name = readLines('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/final_concat_gene_names.txt')
) %>%
separate(gene_name,
into = c('gene_name', 'start', 'end', 'strand', 'extra_data'),
sep = ' # ')
gene_metadata = gene_metadata %>%
mutate(gene_name = str_replace(gene_name, '^>', ''))
empty_gene_counts_filler = gene_metadata %>%
select(gene_name) %>%
mutate(n_reads_mapped = 0L)
rm(gene_metadata)
# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used')
# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')
#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {
# extract metaT sample name from .file_path
.sample_name = str_replace(.file_path, '.paf', '')
# read the .paf file into memory
mapping_results = vroom::vroom(
.file_path,
col_names = c(
c(
'qname', 'qlen', 'qstart', 'qend',
'strand', 'tname', 'tlen', 'tstart',
'tend', 'nmatch', 'alen', 'mapq',
'X13', 'X14', 'X15', 'X16', 'X17'
)
))
# select cols 1-12
mapping_results = select(mapping_results, 1:12)
# filter to keep only certain alignment results
mapping_results = mapping_results %>%
filter(
alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
nmatch >= 0.95 * alen # at least 95% of the aligned bases are a match
)
mapping_results = mapping_results %>%
group_by(tname) %>% # create groupped tibble; groupped by target sequences (genes)
summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads
# bind the counts of mapped reads to genes with all the other genes, counted zero times
mapping_results = mapping_results %>%
rename(gene_name = tname) %>%
bind_rows(
filter(empty_gene_counts_filler, !(gene_name %in% .$gene_name))
)
}
# process .paf files using parallel processing
processed_mapping_counts = future_map2(
list(empty_gene_counts_filler),
file_paths, ~
prepare_read_mappings(.x, .y)
)
# save the list of processed files
saveRDS(processed_mapping_counts, file = 'rdata/list_of_mapped_metaT_reads_from_each_sample.rds')
Finish processing the read-mapping results, calculate normalized
Transcript Per Million (TPM) values for the final dataset
# create tibble that has all metatranscriptome read-mapping results joined together
processed_mapping_counts =
reduce(processed_mapping_counts, left_join, by = 'gene_name') %>%
rename_with( # rename the meteatranscriptome sample columns based on naming scheme used in the paper
~ sample_names,
.cols = 2:last_col()) %>%
mutate(gene_name = str_replace(
gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2'))
#read in gene lengths data
gl = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/gene_lengths_tibble.rda')
gl = gl %>%
mutate(gene_length = gene_length / 1000) # convert unit length to kilobases
# join gene length data to read-mapping data
# divide each read-mapping count by the length of the gene that it mapped to
processed_mapping_counts = processed_mapping_counts %>%
left_join(gl, by = 'gene_name') %>%
relocate(gene_length, .after = 1) %>%
mutate(across(3:last_col(), ~ .x / gene_length))
# function to convert RNA-seq counts to TPM; TPM is normalized across and within samples
# the counts tibble has already had each cell divided by gene length
# this function will calculate the TPM scaling factor for each column and divide each cell in a column by the scaling factor, for all columns
get_tpm_for = function(col) {
scaling_factor = sum(col) / 1e6
col = col / scaling_factor
return(col)
}
# finalize the TPM normalization
processed_mapping_counts = processed_mapping_counts %>%
select(-gene_length) %>%
mutate(across(2:last_col(), ~ get_tpm_for(.x)))
# read in biosynthetic gene cluster (BGC) metadata, parsed from antiSMASH 6 output
bgc = vroom::vroom(
'/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
show_col_types = FALSE)
bgc = bgc %>%
mutate(gene_name = str_replace(
gene_name,
'(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
))
# filter the TPM-normalized read-mapping data to retain only biosynthetic gene cluster
# transcript mappings from biosynthetic clusters that were at least 10 kilobases in size
processed_mapping_counts = processed_mapping_counts %>%
filter(gene_name %in% bgc$gene_name)
# save the output
#saveRDS(processed_mapping_counts, file = 'rdata/tpm_for_bgc_genes_minimap2_res.rds')
## Processing and of DNA-Seq mapping results for MAG relative
abundance plotting
Process the CoverM relative abundance output files, plot relative abundance of the most abundant MAGs
mag_table = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/mag_table.rds')
final_genome_rel_abun_tbl = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/rel_abun_tibble_48_samples.rds')
# create sample names key
metaG_sampleNames_key.X = tibble(
sample_name = colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)],
depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
season = case_when(
str_detect(sample_name, 'AM|BM') ~ 'M',
str_detect(sample_name, 'AN|BN') ~ 'N',
depth %in% c(103,198,234,295,314) ~ 'M',
depth %in% c(143,200,237,247,267) ~ 'N'),
replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
new_sample_name = paste0(season, fraction, depth, '_', replicate)
)
all(colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)] == metaG_sampleNames_key.X$sample_name)
## [1] TRUE
#TRUE
# color palette listing the color choices for all annotation rows/columns on the heatmap
relPalette <- list(Layer = c(
'Oxycline' = 'darkslategray3',
'Shallow anoxic' = 'gold',
'Euxinic' = 'sienna3'),
Fraction = c('PA' = 'darkgreen',
'FL' = 'darkseagreen2'),
Season = c('May' = 'slategray',
'November' = 'slategray1'),
`Depth (m)` = c(
'103' = 'grey90',
'143' = 'grey85',
'198' = 'grey80',
'200' = 'grey75',
'234' = 'grey70',
'237' = 'grey65',
'247' = 'grey60',
'267' = 'grey55',
'295' = 'grey50',
'314' = 'grey45',
'900' = 'grey20'
),
`Phylum/Class` = c('Planctomycetota' = 'wheat1',
'Desulfobacterota' = 'tomato',
'Myxococcota' = 'orange',
'Alphaproteobacteria' = 'orchid1',
'Gammaproteobacteria' = 'thistle1',
'Chloroflexota' = 'slateblue1',
'Marinisomatota' = 'aquamarine',
'Aenigmarchaeota' = 'gold',
'Krumholzibacteriota' = 'darkred',
'Omnitrophota' = 'darkviolet',
'Verrucomicrobiota' = 'khaki',
'Thermoplasmatota' = 'firebrick1',
'AABM5-125-24' = 'gainsboro',
'Bacteroidota' = 'springgreen',
'SAR324' = 'chocolate4',
'Cloacimonadota' = 'grey28',
'Crenarchaeota' = 'yellowgreen',
'Acidobacteriota' = 'deepskyblue',
'Actinobacteriota' = '#666666',
'Eisenbacteria' = '#B89B74',
'Gemmatimonadota' = 'lightsteelblue1',
'Altiarchaeota' = 'ivory',
'Nitrospinota' = '#2A7FB7',
'Fermentibacterota' = '#1B9E77',
'Latescibacterota' = '#AD4C9E',
'Patescibacteria' = 'dodgerblue4',
'Undinarchaeota' = 'turquoise',
'Unclassified' = '#5A8950'
))
mag_table = mag_table %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))
final_genome_rel_abun_tbl =
final_genome_rel_abun_tbl %>%
rename(mag_name = Genome) %>%
filter(!mag_name %in% c('unmapped', paste0('CarAnox_mtb2_', c(1507, 983, 769)))) %>%
mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
left_join(
mag_table %>%
select(mag_name, tax_num),
by = 'mag_name'
) %>%
relocate(tax_num, .after = 1) %>%
arrange(tax_num) %>%
select(-mag_name) %>%
column_to_rownames(var = 'tax_num') %>%
mutate(across(1:last_col(), ~ log1p(.x)), # log-transform relative abundance percentages
row_sums = rowSums(.[1:ncol(.)])) %>%
filter(row_sums > 0.005 * max(row_sums)) %>% # keep only the most abundant MAGs across the samples
select(-row_sums) %>%
rename_with(
~ metaG_sampleNames_key.X$new_sample_name,
.cols = 1:last_col()) %>%
relocate(
metaG_sampleNames_key.X %>%
arrange(desc(fraction), depth) %>%
pull(new_sample_name),
.after = 1)
# create character vector of phylums of the most abundant MAGs, sorted by the most to the least frequently appearing phylum
# split Proteobacteria in Alpha- and Gammaproteobacteria classes
phylum_order =
final_genome_rel_abun_tbl %>%
mutate(row_sums = rowSums(.),
tax_num = rownames(.)) %>%
left_join(
mag_table %>%
select(tax_num, phylum, class),
by = 'tax_num') %>%
select(c(row_sums, tax_num, phylum, class)) %>%
mutate(phylum = if_else(phylum == 'Proteobacteria', class, phylum)) %>%
select(-class) %>%
group_by(phylum) %>%
mutate(group_sum = sum(row_sums)) %>%
ungroup() %>%
arrange(desc(group_sum), phylum) %>%
pull(phylum) %>%
unique()
#change UAP2
phylum_order[28] = 'Undinarchaeota'
# create final relative abundance matrix to be used for plotting
final_genome_rel_abun_tbl =
final_genome_rel_abun_tbl %>%
mutate(row_sums = rowSums(.),
tax_num = rownames(.)) %>%
left_join(
mag_table %>%
select(tax_num, phylum, class),
by = 'tax_num') %>%
mutate(phylum = if_else(
phylum == 'Proteobacteria', class, phylum)) %>%
select(-class) %>%
group_by(phylum) %>%
mutate(group_sum = sum(row_sums)) %>%
ungroup() %>%
arrange(desc(group_sum), phylum) %>%
select(-c(row_sums, group_sum, phylum)) %>%
column_to_rownames(var = 'tax_num') %>%
relocate( # function rearrange the columns by season, and water layer - incrementally increasing in depth
colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = sort(c('NPA143_2', x[str_detect(x, 'NPA')])),
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_',
seq(1, 6, by = 1)),
each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
g = g[g != 'NPA143_2']
return(g)}}()
) %>%
as.matrix()
# row metadata for relative abundance plot - the phylum/class of each MAG
row_anno =
final_genome_rel_abun_tbl %>%
as.data.frame() %>%
rownames_to_column(var = 'tax_num') %>%
select(tax_num) %>%
left_join(
mag_table %>%
select(tax_num, phylum, class),
by = 'tax_num') %>%
as_tibble() %>%
mutate(phylum = if_else(
phylum == 'Proteobacteria', class, phylum)) %>%
select(-class) %>%
mutate(phylum = if_else(phylum == 'UAP2', 'Undinarchaeota', phylum)) %>%
rename(`Phylum/Class` = phylum) %>%
arrange(factor(`Phylum/Class`, levels = phylum_order)) %>%
column_to_rownames(var = 'tax_num')
# sort the Phylum/Class color palette by frequency
relPalette$`Phylum/Class` = relPalette$`Phylum/Class`[unique(row_anno$`Phylum/Class`)]
# column metadata for the relative abundance heatmap; annotation codings for sample fraction, sampling season, water layer, and water depth
col_anno = tibble(
cols = colnames(final_genome_rel_abun_tbl),
Fraction = str_replace(cols, '[A-Z]{1}([A-Z]{2})\\d{3}.*', '\\1'),
Season = if_else(str_detect(cols, 'M'), 'May', 'November'),
Layer = str_replace(cols, '[A-Z]+(\\d+)_.*', '\\1')
) %>%
mutate(Layer = as.integer(Layer),
Layer = case_when(
Layer < 247 ~ 'Oxycline',
Layer >= 247 & Layer < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(`Depth (m)` = str_replace(
cols, '.*(\\d{3}).*', '\\1'
)) %>%
column_to_rownames(var = 'cols') %>%
relocate(`Depth (m)`, Fraction, Season, Layer)
#
final_genome_rel_abun_plot = final_genome_rel_abun_tbl %>%
pheatmap::pheatmap(
name = 'Relative abundance (% total reads)',
angle_col = '315',
fontsize = 8,
annotation_row = row_anno,
annotation_col = col_anno,
annotation_colors = relPalette,
show_colnames = FALSE,
gaps_col = 23,
clustering_method = 'ward.D',
cluster_rows = FALSE,
cluster_cols = FALSE,
show_rownames = FALSE,
treeheight_row = 0,
treeheight_col = 0,
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)),
legend_breaks = seq(0, 3.5, by = 0.5),
legend_labels = gt_render(c(
'0% ',
'0.64% ',
'1.72% ',
'3.48% ',
'6.39% ',
'11.18% ',
'19.09% ',
'32.12% '))
)
#ggsave(filename = '~/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/LAST_final_rel_abun_june_2022.svg',
# plot = final_genome_rel_abun_plot,
# width = 7,
# height = 10)
final_genome_rel_abun_plot
## Functionally annotating genes within BGCs, plotting
biosynthetic genes and transcripts by function
Format antibiotic resistance gene annotation and expression data, save the data as a supplementary table, plot the genomic potential/transcript expression
# formatting antibiotic resistance gene data ------------------------------
bnh_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bnh_summary.rds')
smc_palette = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc_palette.rds')
ar = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/antibiotic-resistance-hmm-scan-results.rda')
load('~/Documents/cariaco-tidy-analysis/rdata-files/sm_cluster_gene_desriptive_tibbles_from_anti6_cluster_gbk_outputs_march_2022.rdata')
# loads:
# clust_gene_descr
# clust_gene_summary
pa_ar_expr_plot = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/pa_ar_expr_plot.rds')
fl_ar_expr_plot = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/fl_ar_expr_plot.rds')
clust_gene_summary = clust_gene_summary %>%
mutate(mag_name = str_replace(filename, '(MAG_\\d+)_.*', '\\1'), .before = 1) %>%
rename(gene_identifier = gene)
ar = ar %>%
select(cluster_name, query_name, sequence_evalue,
gene_identifier, start_to_end, strand) %>%
filter(sequence_evalue < 1e-5) %>% # filter antibiotic resistance HMM hits to keep those with e-value less than 1e-05
mutate(mag_name = str_replace(cluster_name, '(MAG_\\d+)_.*', '\\1'), .before = 1) %>%
left_join(clust_gene_summary, by = c('mag_name', 'gene_identifier')) %>%
select(-c(product, evalue, aSDomain, description)) %>%
mutate(gene_name = paste0('ctg', ID, '-', mag_name), .after = 1)
# this is the finished version of the antibiotic resistance genes present in the gene clusters
ar =
ar %>%
group_by(gene_name) %>%
filter(sequence_evalue == min(sequence_evalue)) %>%
ungroup() %>%
group_by(mag_name, gene_name, contig) %>%
summarize(gene_fns = paste0(query_name, collapse = ';'),
start_end_summary = paste0(start_to_end, collapse = ';'),
.groups = 'drop') %>%
left_join(mag_table %>%
select(mag_name, domain, phylum),
by = 'mag_name') %>%
left_join(bnh_summary %>%
select(contig, is_hybrid, product),
by = 'contig')
#remove data from clusters that are not greater than 10kb in length
ar = ar %>%
filter(!is.na(product))
#import resfams metadata
resfams_metadata = read_csv('~/Downloads/180102_resfams_metadata_updated_v1.2.2.csv') %>%
select(-c(4:5)) %>%
rename(gene_fns = 2,
binned_gene_fns = 6) %>%
mutate(gene_fns = case_when(
gene_fns == 'RNDAntibioticEffluxPump' ~ 'RND_efflux',
gene_fns == 'MFSAntibioticEffluxPump' ~ 'MFS_efflux',
gene_fns == 'Tetracycline_Resistance_Ribosomal_Protection_Protein' ~ 'tet_ribosomoal_protect',
gene_fns == 'Chloramphenicol_Acetyltransferase_CAT' ~ 'Chlor_Acetyltrans_CAT',
gene_fns == 'ABCAntibioticEffluxPump' ~ 'ABC_efflux',
gene_fns == 'Chloramphenicol_Phosphotransferase_CPT' ~ 'Chlor_Phospho_CPT',
gene_fns == 'FluoroquinoloneResistantDNATopoisomerase' ~ 'Fluor_Res_DNA_Topo',
gene_fns == 'Erm23SRibosomalRNAMethyltransferase' ~ 'Erm23S_rRNA_methyltrans',
gene_fns == 'Cfr23RibosomalRNAMethyltransferase' ~ 'Cfr23_rRNA_methyltrans',
gene_fns == 'QuinoloneResistanceProteinQnr' ~ 'Qnr',
gene_fns == 'MacrolideGlycosyltransfer' ~ 'macrolide_glycosyl',
gene_fns == 'Tetracycline_Resistance_MFS_Efflux_Pump' ~ 'tet_MFS_efflux',
gene_fns == 'CPT' ~ 'Chlor_Phospho_CPT',
gene_fns == 'Chloramphenicol_Efflux_Pump' ~ 'Chlor_Efflux_Pump',
gene_fns == '16S_Ribosomal_RNA_Methyltransferase' ~ '16S_rRNA_methyltrans',
TRUE ~ gene_fns
))
## Rows: 173 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): ResfamID, Resfam Family Name, Description, !!!OLD CARD ARO Identif...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ar_final = ar %>%
mutate(gene_fns = str_replace(gene_fns, '(.*);.*', '\\1')) %>%
group_by(gene_fns) %>%
add_tally() %>%
left_join(
resfams_metadata %>%
select(2, 6) %>%
rename(gene_fns = 1,
binned_gene_fns = 2),
by = 'gene_fns') %>%
ungroup() %>%
mutate(product = case_when(
str_detect(product, regex('hybrid', ignore_case = TRUE)) ~ 'Hybrid cluster',
product == 'Other' ~ 'Other cluster*',
TRUE ~ product
))
ar_gene_suppl_table = ar_final %>%
separate(
start_end_summary,
into = c('start_position_on_contig', 'end_position_on_contig'),
sep = ':') %>%
rename(
hmm_annotation = gene_fns,
resfam_functional_category = binned_gene_fns,
from_hybrid_cluster = is_hybrid) %>%
select(-n) %>%
relocate(resfam_functional_category, .after = hmm_annotation) %>%
relocate(c(domain, phylum), .after = mag_name)
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [939, 1548].
# save .tsv of antibiotic resistance gene annotations
#vroom::vroom_write(ar_gene_suppl_table, file = 'stables/antibiotic_resistance_genes_metadata_suppl_table.tsv')
# plotting ----------------------------------------------------------------
# create plot of all antibiotic resistance genes from all high-quality MAGs
total_ar_plot =
ar %>%
mutate(gene_fns = str_replace(gene_fns, '(.*);.*', '\\1')) %>%
left_join(resfams_metadata %>%
select(gene_fns, binned_gene_fns),
by = 'gene_fns') %>%
mutate(product = case_when(
str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
product == 'Other' ~ 'Other cluster*',
product == 'PKS' ~ 'Polyketide',
TRUE ~ product)) %>%
filter(
!str_detect(binned_gene_fns,
'Nucleotidyltransferase|Other|Quinolone Resistance|Acetyltransferase|Phosphotransferase')) %>%
mutate(group = 'Antibiotic resistance genes'
#'Total antibiotic resistance gene count by biosynthetic clusters class'
) %>%
group_by(binned_gene_fns, product, group) %>%
mutate(number = n()) %>%
select(binned_gene_fns, product, group, number) %>%
ungroup() %>%
arrange(binned_gene_fns, product) %>%
distinct() %>%
mutate(binned_gene_fns = if_else(
binned_gene_fns == 'Gylcopeptide Resistance',
'Glycopeptide Resistance', binned_gene_fns
)) %>%
ggplot(aes(
y = factor(binned_gene_fns,
levels = rev(c('Antibiotic Inactivation', 'Target Protection',
'Glycopeptide Resistance', 'Beta-Lactamase',
'MFS Transporter', 'rRNA Methyltransferase',
'RND Antibiotic Efflux', 'ABC Transporter',
'Gene Modulating Resistance'))),
x = number,
fill = fct_rev(fct_infreq(product)))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
theme(
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_text(size = 10),
axis.text.x = element_text(size = 10),
strip.background = element_rect(fill = "wheat1"),
legend.position = c(0.99, 0.975),
strip.text = element_text(size = 11),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8),
legend.justification = c(1, 0.99),
legend.background = element_blank(),
panel.border = element_rect(color = 'black', fill = NA),
legend.box.background = element_rect(color = 'black'),
#legend.position = 'none' ## removes it
) +
labs(y = 'Number of genes\n', x = '\nTotal genes\n',
fill = 'Biosynthetic cluster class') +
#scale_y_continuous(breaks = seq(0, 600, by = 50)) +
scale_fill_manual(values = smc_palette) +
guides(fill = guide_legend(ncol = 3)) +
facet_wrap(~ group, scales = 'free') +
scale_x_continuous(breaks = seq(0, 600, by = 150))
# plot of transcripts detected in PA metatranscriptomes - groupped by antibiotic resistance type
pa_final_antibiotic_expr_plot = pa_ar_expr_plot$data %>%
mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
select(binned_gene_fns, product) %>%
mutate(group = 'Particle-associated') %>%
group_by(binned_gene_fns, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
arrange(binned_gene_fns, product) %>%
distinct() %>%
filter(
!str_detect(binned_gene_fns,
'Nucleotidyltransferase|Other Efflux|Quinolone Resistance|Acetyltransferase|Phosphotransferase')) %>%
mutate(binned_gene_fns = if_else(
binned_gene_fns == 'Gylcopeptide Resistance',
'Glycopeptide Resistance', binned_gene_fns)) %>%
ggplot(aes(
y = factor(binned_gene_fns,
levels = rev(c('Antibiotic Inactivation', 'Target Protection',
'Glycopeptide Resistance', 'Beta-Lactamase',
'MFS Transporter', 'rRNA Methyltransferase',
'RND Antibiotic Efflux', 'ABC Transporter',
'Gene Modulating Resistance'))),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text.x = element_text(size = 10),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
legend.position = 'none',
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank()
) +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal transcripts'
)
# plot of transcripts detected in FL metatranscriptomes - groupped by antibiotic resistance type
fl_final_antibiotic_expr_plot =
fl_ar_expr_plot$data |>
mutate(product = if_else(product == 'PKS', 'Polyketide', product)) |>
select(binned_gene_fns, product) |>
mutate(group = 'Free-living') |>
group_by(binned_gene_fns, product, group) |>
mutate(number = n()) |>
ungroup() |>
arrange(binned_gene_fns, product) |>
distinct() |>
filter(
!str_detect(binned_gene_fns,
'Nucleotidyltransferase|Other Efflux|Quinolone Resistance|Acetyltransferase|Phosphotransferase')) %>%
mutate(binned_gene_fns = if_else(
binned_gene_fns == 'Gylcopeptide Resistance',
'Glycopeptide Resistance', binned_gene_fns
)) |>
ggplot(aes(
y = factor(binned_gene_fns,
levels = rev(c('Antibiotic Inactivation', 'Target Protection',
'Glycopeptide Resistance', 'Beta-Lactamase',
'MFS Transporter', 'rRNA Methyltransferase',
'RND Antibiotic Efflux', 'ABC Transporter',
'Gene Modulating Resistance'))),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 10),
axis.text.x = element_text(size = 10),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none') +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal transcripts'
) +
scale_x_reverse(limits = c(75, 0))
final_transcript_plotgrid = plot_grid(
fl_final_antibiotic_expr_plot,
pa_final_antibiotic_expr_plot,
nrow = 1,
rel_widths = c(0.6, 0.4)
)
# final combined plot of antibiotic resistance genomic potential/transcript detection
FINAL_ANTIBIOTIC_RESIST_PLOT =
plot_grid(total_ar_plot,
final_transcript_plotgrid,
ncol = 1,
labels = 'auto',
rel_heights = c(0.51, 0.49))
FINAL_ANTIBIOTIC_RESIST_PLOT
Format antibiotic resistance gene annotation and expression
data, save the data as a supplementary table, plot the genomic
potential/transcript expression
final_domain_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/final_domain_summary.rds')
smc_palette = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc_palette.rds')
final_de_and_clust_data = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/final_de_and_clust_data.rds')
# create plot showing the most frequently occurring core/auxiliary biosynthetic genes in the Cariao MAGs
all_common_domains_plot =
final_domain_summary %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
filter(!is.na(product)) %>%
mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
mutate(description = case_when(
str_detect(description,
regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
str_detect(description, regex(
'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
'FAD/FMN-dependent oxidoreductase',
str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',
TRUE ~ description)) %>% #view()
filter(n > 0.2 * max(n) |
description == 'FAD/FMN-dependent oxidoreductase'|
description == 'FAD/FMN binding domain') %>% #pull(description) %>% unique()
mutate(group = 'Core and additional biosynthetic genes') %>% #pull(description) %>% tabyl() %>% view()
select(c(description, product, group)) %>%
group_by(description, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
distinct() %>%
arrange(desc(n)) %>%
filter(description != 'ABC Transporter') %>%
ggplot(aes(
y = fct_inorder(description),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text = element_text(size = 9),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = c(0.9875, 0.98),
legend.justification = c(1, 0.99),
legend.background = element_blank(),
panel.border = element_rect(color = 'black', fill = NA),
legend.box.background = element_rect(color = 'black'),
legend.title = element_text(size = 11),
legend.text = element_text(size = 10)
#legend.position = 'none'
) +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal genes\n'
) +
guides(fill = guide_legend(ncol = 2))
# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the PA metatranscriptomes
pa_domains_plot =
final_de_and_clust_data %>%
filter(gene_name %in% final_domain_summary$gene_name) %>%
filter(is_sig_pa_biosynthetic | (pa_counts > 0 & fl_counts == 0)) %>%
rename(raw_product = product) %>%
left_join(final_domain_summary %>%
select(c(mag_name, contig, gene_name, product, description)),
by = c('contig', 'mag_name', 'gene_name')) %>%
mutate(product = case_when(
str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
product == 'Other' ~ 'Other cluster*',
product == 'PKS' ~ 'Polyketide',
TRUE ~ product)) %>%
select(-n) %>%
filter(!is.na(product)) %>%
mutate(description = case_when(
str_detect(description,
regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
str_detect(description, regex(
'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
'FAD/FMN-dependent oxidoreductase',
str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',
TRUE ~ description)) %>%
filter(description %in% all_common_domains_plot$data$description) %>%
mutate(group = 'Particle-associated') %>% #pull(description) %>% tabyl() %>% view()
select(c(description, product, group)) %>%
group_by(description, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
distinct() %>%
arrange(desc(n)) %>%
filter(description != 'ABC Transporter') %>%
ggplot(aes(
y = factor(description,
levels = unique(all_common_domains_plot$data$description)),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text.x = element_text(size = 9),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none') +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal transcripts') +
scale_x_continuous(limits = c(0, 200))
# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the FL metatranscriptomes
fl_domains_plot =
final_de_and_clust_data %>%
filter(gene_name %in% final_domain_summary$gene_name) %>%
filter(is_sig_fl_biosynthetic | (fl_counts > 0 & pa_counts == 0)) %>%
rename(raw_product = product) %>%
left_join(final_domain_summary %>%
select(c(mag_name, contig, gene_name, product, description)),
by = c('contig', 'mag_name', 'gene_name')) %>%
mutate(product = case_when(
str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
product == 'Other' ~ 'Other cluster*',
product == 'PKS' ~ 'Polyketide',
TRUE ~ product)) %>%
select(-n) %>%
filter(!is.na(product)) %>%
mutate(description = case_when(
str_detect(description,
regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
str_detect(description, regex(
'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
'FAD/FMN-dependent oxidoreductase',
str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',
TRUE ~ description)) %>%
filter(description %in% all_common_domains_plot$data$description) %>%
mutate(group = 'Free-living') %>% #pull(description) %>% tabyl() %>% view()
select(c(description, product, group)) %>%
group_by(description, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
distinct() %>%
arrange(desc(n)) %>%
filter(description != 'ABC Transporter') %>%
ggplot(aes(
y = factor(description,
levels = unique(all_common_domains_plot$data$description)),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text = element_text(size = 9),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none') +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal transcripts') +
scale_x_reverse(limits = c(200, 0))
# combine the PA and FL transcript expression bar plots
final_domains_plotgrid = plot_grid(
fl_domains_plot,
pa_domains_plot,
nrow = 1,
rel_widths = c(0.65, 0.35)
)
# combine the expression barplots with the genomic potential bar plot to create the final plot
FINAL_ALL_DOMAINS_PLOT =
plot_grid(all_common_domains_plot,
final_domains_plotgrid,
ncol = 1,
labels = 'auto',
rel_heights = c(0.51, 0.49))
FINAL_ALL_DOMAINS_PLOT
## Differential gene expression analysis
Prepare gene expression matrix for DESeq2 differential expression analysis
# deseq2 ALL PA DEPTHS VS ALL FL DEPTHS run on minimap2 counts files -- 95% min percent identity of alignments at least 50% of read length aligned --------
# set working dir to the one containing .paf files from metatranscriptomic read-mapping to all cariaco MAG genes
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/')
files = system('ls *.paf', intern = TRUE)
sample_names = str_replace(files, '.paf', '')
# load list of tibbles - each tibble containing the read-mapping counts to genes from each metatranscriptomic sample
mapped_read_counts = readRDS('rdata/list_of_mapped_metaT_reads_from_each_sample.rds')
# join the tibbles together
mapped_read_counts =
reduce(mapped_read_counts, left_join, by = 'gene_name')
# rename samples to use sample names used in the paper
mapped_read_counts =
mapped_read_counts %>%
rename_with(
~ sample_names,
.cols = 2:last_col()
)
mapped_read_counts =
mapped_read_counts %>%
mutate(gene_name = str_replace(
gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2')) %>%
as.data.frame() %>%
column_to_rownames(var = 'gene_name')
# create column metadata for use with the DESeq() function
col_data = tibble(sample = colnames(mapped_read_counts)) %>%
mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
depth = as.integer(depth),
layer = case_when(depth == 900 ~ 'euxinic',
depth >= 103 & depth <= 237 ~ 'oxycline',
depth > 237 & depth < 900 ~ 'shallow.anoxic'),
group = paste0(fraction, '_', layer)) %>%
select(-c(fraction, depth, layer)) %>%
mutate(group = str_replace(group, '(.*)_.*', '\\1')) %>%
mutate(group = as_factor(group))
all(colnames(mapped_read_counts) == col_data$sample) # TRUE
# load a tibble containing the length in bp of each gene from all Cariaco MAGs
gl = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/gene_lengths_tibble.rda')
gl = gl %>%
mutate(gene_length = gene_length / 1000) # convert length unit to kilobases
logTransformed_mapping_tpm = mapped_read_counts %>%
rownames_to_column(var = 'gene_name') %>%
as_tibble() %>%
left_join(gl, by = 'gene_name') %>%
relocate(gene_length, .after = 1) %>%
mutate(across(3:last_col(), ~ .x / gene_length))
get_tpm_for = function(col) {
scaling_factor = sum(col) / 1e6
col = col / scaling_factor
return(col)
}
logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
select(-gene_length) %>%
mutate(across(2:last_col(), ~ get_tpm_for(.x))) %>%
mutate(across(2:last_col(), ~ log1p(.x)))
bgc = vroom::vroom(
'/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
show_col_types = FALSE)
bgc = bgc %>%
mutate(gene_name = str_replace(
gene_name,
'(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
))
logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
filter(gene_name %in% bgc$gene_name)
#saveRDS(logTransformed_mapping_tpm, file = 'rdata/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
da_dds <- DESeq2::DESeqDataSetFromMatrix(mapped_read_counts,
colData = col_data,
design = ~ group)
register(MulticoreParam(40))
da_final_dds <- DESeq2::DESeq(da_dds, parallel = TRUE) # run the DESeq2 statistical model which assumes a negative binomial distribution for
# the RNA-seq count matrix data
#estimating size factors
#estimating dispersions
#gene-wise dispersion estimates: 40 workers
#mean-dispersion relationship
#final dispersion estimates, fitting model and testing: 40 workers
#-- replacing outliers and refitting for 10694 genes
#-- DESeq argument 'minReplicatesForReplace' = 7
#-- original counts are preserved in counts(dds)
#estimating dispersions
#fitting model and testing
# pull Wald test results comparing the oxycline PA to FL fractions
res <- DESeq2::results(da_final_dds,
contrast = c('group',
'PA',
'FL'),
alpha = 0.05)
final_res <- res %>%
as.data.frame() %>%
rownames_to_column(var = 'mag_name') %>%
as_tibble() %>%
#filter(!is.na(padj),
# padj < 0.05) %>%
mutate(enrichment = case_when(
log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
padj > 0.05 | is.na(padj) ~ 'no significant difference'))
table(final_res$enrichment)
#free-living enriched no significant difference particle enriched
#48868 1514220 83359
col_metadata = tibble(sample = colnames(mapped_read_counts)[
2:ncol(mapped_read_counts)
]) %>%
mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
depth = as.integer(depth),
layer = case_when(depth == 900 ~ 'euxinic',
depth >= 103 & depth <= 237 ~ 'oxycline',
depth > 237 & depth < 900 ~ 'shallow.anoxic'))
mapped_read_counts = mapped_read_counts %>%
rownames_to_column(var = 'gene_name') %>%
as_tibble()
cs = mapped_read_counts %>%
#as.data.frame() %>%
#column_to_rownames(var = 'gene_name') %>%
#rownames_to_column(var = 'gene_name') %>%
#as_tibble() %>%
mutate(mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1'), .before = 1)
cs = cs %>%
mutate(
fl_counts = rowSums(select(., col_metadata %>%
filter(fraction == 'FL') %>%
pull(sample))),
pa_counts = rowSums(select(., col_metadata %>%
filter(fraction == 'PA') %>%
pull(sample)))
) %>%
relocate(c(fl_counts, pa_counts), .after = gene_name) %>%
select(1:8)
#bgc = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/bgc_kb_5000_gene_data.rda')
bgc_uniq = bgc %>%
select(mag_name, gene_name) %>%
distinct() %>%
mutate(is_biosynthetic = TRUE)
sig_genes = res %>%
as.data.frame() %>%
rownames_to_column(var = 'gene_name') %>%
as_tibble() %>%
left_join(bgc_uniq %>%
select(-mag_name),
by = 'gene_name')
#pa_sig
sig_genes %>%
filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
mutate(is_sig_pa_biosynthetic = TRUE) %>%
select(gene_name, is_sig_pa_biosynthetic)
#fl_sig
sig_genes %>%
filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
mutate(is_sig_fl_biosynthetic = TRUE) %>%
select(gene_name, is_sig_fl_biosynthetic)
clust_analysis = pmap_df(list(
list(sig_genes),
list(sig_genes %>%
filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
mutate(is_sig_pa_biosynthetic = TRUE) %>%
select(gene_name, is_sig_pa_biosynthetic)),
list(sig_genes %>%
filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
mutate(is_sig_fl_biosynthetic = TRUE) %>%
select(gene_name, is_sig_fl_biosynthetic)),
list(
cs %>% select(2:4) #%>% rename(fl_counts = 2, pa_counts = 3)#,
#cs %>% select(c(2, 5:6)) %>% rename(fl_counts = 2, pa_counts = 3),
#cs %>% select(c(2, 7:8)) %>% rename(fl_counts = 2, pa_counts = 3)
)
), ~
..1 %>%
left_join(..2, by = 'gene_name') %>%
left_join(..3, by = 'gene_name') %>%
left_join(..4, by = 'gene_name')
)
clust_data_pa = clust_analysis %>%
filter(is_sig_pa_biosynthetic |
(pa_counts > 0 & fl_counts == 0 & is_biosynthetic)
) %>%
relocate(c(fl_counts, pa_counts, padj,
is_sig_pa_biosynthetic,
is_sig_fl_biosynthetic,
is_biosynthetic), .after = gene_name) %>%
left_join(bgc, by = 'gene_name')
clust_data_fl = clust_analysis %>%
filter(is_sig_fl_biosynthetic |
(fl_counts > 0 & pa_counts == 0 & is_biosynthetic)
) %>%
relocate(c(fl_counts, pa_counts, padj,
is_sig_pa_biosynthetic,
is_sig_fl_biosynthetic,
is_biosynthetic), .after = gene_name) %>%
left_join(bgc, by = 'gene_name')
clust_data_pa %>% dim()
clust_data_fl %>% dim()
bgc_summary = bgc %>%
group_by(contig, region) %>%
summarize(n = n(), .groups = 'drop')
pa_expressed = clust_data_pa %>%
left_join(bgc_summary, by = c('contig', 'region')) %>%
group_by(contig, region) %>%
rename(cluster_size = n) %>%
add_tally() %>%
ungroup() %>%
mutate(proportion_expressed = n / cluster_size)
pa_expressed_summary = pa_expressed %>%
select(product, proportion_expressed,
contig, region, cluster_size, n) %>%
distinct()
fl_expressed = clust_data_fl %>%
left_join(bgc_summary, by = c('contig', 'region')) %>%
group_by(contig, region) %>%
rename(cluster_size = n) %>%
add_tally() %>%
ungroup() %>%
mutate(proportion_expressed = n / cluster_size)
fl_expressed_summary = fl_expressed %>%
select(product, proportion_expressed,
contig, region, cluster_size, n) %>%
distinct()
####
map_chr(ls(), ~ paste0(.x, ': ', object.size(get(.x)) %>% format(units = "Mb")))
#save(list = ls(), file = 'rdata/deseq2_full_analysis_with_minimap2_metaT_alignments.rdata')
final_de_and_clust_data = pa_expressed %>%
bind_rows(fl_expressed)
#saveRDS(final_de_and_clust_data, file = 'rdata/final_de_and_uniq_expr_clust_data.rds')
test_final_de = readRDS('~/Downloads/final_de_and_uniq_expr_clust_data.rds')
#PA
test_final_de %>%
filter(
(padj < 0.05 & log2FoldChange > 0) |
(pa_counts > 0 & fl_counts == 0)) %>%
left_join(
mag_table %>%
select(mag_name, phylum, class),
by = 'mag_name') %>%
pull(phylum) %>%
table() %>%
sort()
# Omnitrophota, Desulfobacterota, Planctomycetota,
# Myxococcota, Proteobacteria (Gammaproteos)
# hglE-KS arylpolyene T1PKS
# 153 175 205
# ladderane ranthipeptide NRPS-like
# 261 320 334
# RRE-containing terpene
# 371 414
#FL
test_final_de %>%
filter(
(padj < 0.05 & log2FoldChange < 0) |
(fl_counts > 0 & pa_counts == 0)) %>%
left_join(
mag_table %>%
select(mag_name, phylum, class),
by = 'mag_name') %>%
pull(phylum) %>%
table() %>%
sort()
# Proteobacteria (Alphaproteos), Omnitrophota, SAR324,
# Desulfobacterota
# T1PKS ladderane
# 88 112
# phosphonate RRE-containing
# 130 135
# ranthipeptide betalactone
# 148 283
# NRPS-like terpene
# 291 348
# 25 genes DE'd in PA w/ padj < 0.05
# 138 genes DE'd in FL w/ padj < 0.05
test_final_de %>%
filter(pa_counts > 0 & fl_counts == 0) %>%
nrow()
test_final_de %>%
filter(fl_counts > 0 & pa_counts == 0) %>%
nrow()
## Barplots of Bacterial and Archaeal MAGs recovered
# mag phlyum dists/phylum avg completeness --------------------------------
sm_key = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/sm_key.rds')
heat_colors <- c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c('#ABD9E9',
'#FEE090',
'orange',
'#D73027'))(8000))
sm_summary <- sm_key %>%
select(mag_name, cluster, sm_class) %>%
distinct() %>%
left_join(mag_table, by = 'mag_name')
# plot used
mag_dist_plot_archaea <- mag_table %>%
group_by(phylum) %>%
summarize(freq = n()) %>%
arrange(desc(freq)) %>%
mutate(x = row_number(),
phylum) %>%
left_join(mag_table %>%
select(phylum, domain, completeness) %>%
group_by(phylum) %>%
add_tally(mean(completeness), name = 'average_completeness') %>%
select(-completeness) %>%
ungroup() %>%
distinct(),
by = 'phylum') %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
mutate(phylum = as_factor(phylum)) %>%
filter(domain == 'Archaea') %>%
ggplot(aes(phylum, freq)) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
#plot.title = element_text(size = 17),
axis.title.x = element_text(size = 10),
axis.title.y = element_blank(),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = 'wheat1'),
#legend.title = element_text(size = 16),
#legend.direction = 'horizontal',
#legend.margin = margin(c(4,15,4,4)),
#legend.text = element_text(size = 13),
axis.text.y = element_text(size = 8),
legend.position = 'none'
#legend.position = c(0.8075, 0.91),
#tagger.panel.tag.background = element_rect(fill = 'wheat1'),
#tagger.panel.tag.text = element_text(face = 'bold'),
#legend.background = element_rect(fill = "white",
# color = "black")
) +
geom_segment(aes(xend = phylum, yend = 0), color = 'grey50') +
geom_point(size = 1.5,
aes(color = average_completeness)) +
geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
facet_grid(~ domain, scales = 'free', space = 'free') +
labs(x = 'Phylum',
y = 'Number of MAGs per phylum\n',
color = 'Average genome
completeness per phylum',
#title = 'Taxonomic distribution of recovered high-quality MAGs (by phylum, n = 568)'
) +
scale_color_gradientn(colors = heat_colors,
breaks = seq(75, 100, by = 5),
limits = c(79, 100)
) +
scale_y_continuous(limits = c(0, 75),
expand = expansion(add = c(1, 0)))# +
#tag_facets(position = 'tl')
mag_dist_plot_bacteria =
mag_table %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
group_by(phylum) %>%
summarize(freq = n()) %>%
arrange(desc(freq)) %>%
mutate(x = row_number(),
phylum) %>%
left_join(mag_table %>%
select(phylum, domain, completeness) %>%
group_by(phylum) %>%
add_tally(mean(completeness), name = 'average_completeness') %>%
select(-completeness) %>%
ungroup() %>%
distinct(),
by = 'phylum') %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
mutate(phylum = as_factor(phylum)) %>%
filter(domain == 'Bacteria') %>%
ggplot(aes(phylum, freq)) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
#plot.title = element_text(size = 17),
axis.title = element_text(size = 10),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = 'wheat1'),
legend.title = element_text(size = 8),
legend.direction = 'horizontal',
legend.margin = margin(c(4,15,4,4)),
legend.text = element_text(size = 8),
axis.text.y = element_text(size = 8),
legend.position = c(0.71, 0.87),
#axis.title.y = element_blank(),
#tagger.panel.tag.background = element_rect(fill = 'wheat1'),
#tagger.panel.tag.text = element_text(face = 'bold'),
legend.background = element_rect(fill = "white",
color = "black")
) +
geom_segment(aes(xend = phylum, yend = 0), color = 'grey50') +
geom_point(size = 1.5,
aes(color = average_completeness)) +
geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
facet_grid(~ domain, scales = 'free', space = 'free') +
labs(x = 'Phylum',
y = 'Number of MAGs per phylum\n',
color = 'Average genome
completeness per phylum',
#title = 'Taxonomic distribution of recovered high-quality MAGs (by phylum, n = 568)'
) +
scale_color_gradientn(colors = heat_colors,
breaks = seq(75, 100, by = 5),
limits = c(79, 100)
) +
scale_y_continuous(limits = c(0, 75),
expand = expansion(add = c(1, 0)))
mag_dist_plot = plot_grid(mag_dist_plot_bacteria, mag_dist_plot_archaea,
labels = c('a', 'b'),
rel_widths = c(4, 1),
nrow = 1,
label_size = 11.5,
align = 'hv',
label_x = c(0.045, 0.205)
)
mag_dist_plot
# tiff(filename = '~/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/mag_hq_frequency_plots_by_phylum_split.tiff',
# width = 7, height = 4, units = 'in',
# res = 300)
# print(mag_dist_plot)
# dev.off()
#ggsave(
# filename = '~/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/mag_hq_frequency_plots_by_phylum_split.svg',
# width = 7,
# height = 4,
# units = 'in'
#)
Plot of biosynthetic cluster potential normalized by MAG
phylum
# BGC frequency plot by phylum --------------------------------------------
bgc_count_data_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_count_data_summary.rds')
smc = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc.rds')
highlight <- function(x, pat, color = "black", family = "") {
ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}
face2 = bgc_count_data_summary %>%
select(phylum, domain, n_mags_in_phylum) %>%
distinct() %>%
filter(n_mags_in_phylum == 1) %>%
mutate(type = 'bold')
#WORKS NOW - MORE PHYLA HAVE ONLY ONE MEMBER - ADJUST FOR THAT ACCORDINGLY - BOLDING THEIR LABELS ON X AXIS
bact_final_sm_freq_plot = bgc_count_data_summary %>%
filter(domain == 'Bacteria') %>%
ggplot(aes(as_factor(phylum), norm_sm_freq,
fill = fct_rev(fct_infreq(product)))) +
geom_bar(stat = 'identity') +
theme_bw() +
theme(axis.text.x = element_text(angle = 70, hjust = 1, size = 10),
axis.text.y = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = c(0.98, 0.98),
legend.justification = c(0.98, 0.98),
legend.text = element_text(size = 11),
legend.title = element_text(size = 12),
legend.background = element_blank(), panel.border =
element_rect(color = 'black', fill = NA),
legend.box.background = element_rect(color = 'black'),
panel.grid = element_blank(),
strip.text = element_text(size = 13),
axis.title.y = element_blank(),
strip.background = element_rect(fill = 'wheat1')
) +
labs(x = '\nPhylum',
fill = 'Secondary metabolite class') +
scale_fill_manual(values = smc) +
facet_grid(~ domain, space = 'free', scales = 'free') +
scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
theme(axis.text.x = element_markdown())# +
arch_final_sm_freq_plot = bgc_count_data_summary %>%
filter(domain == 'Archaea') %>%
ggplot(aes(as_factor(phylum), norm_sm_freq,
fill = fct_rev(fct_infreq(product)))) +
geom_bar(stat = 'identity') +
theme_bw() +
theme(axis.text.x = element_text(angle = 70, hjust = 1),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = 'none',
panel.grid = element_blank(),
strip.text = element_text(size = 13),
axis.title.y = element_blank(),
strip.background = element_rect(fill = 'wheat1')
) +
labs(x = '\nPhylum',
fill = 'Secondary metabolite class') +
scale_fill_manual(values = smc) +
facet_grid(~ domain, space = 'free', scales = 'free') +
scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
theme(axis.text.x = element_markdown())
#
# save the sm biosynthetic potential plot
sm_freq_plot = plot_grid(bact_final_sm_freq_plot, NULL, arch_final_sm_freq_plot,
labels = c('a', '', 'b'),
rel_widths = c(3, 0.02, 1),
nrow = 1,
align = 'hv'#,
#label_x = c(0.0975, 0, 0)
)
sm_freq_plot
#ggsave(
# sm_freq_plot,
# filename = '~/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/sm_normalized_freq_plot_by_phylum.svg',
# width = 7,
# height = 7,
# units = 'in'
#)
## Processing and of RNA-Seq mapping results for biosynthetic
transcript expression plotting, and creating a heatmap plot of
results
mm2_final = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
metaT_col_order = tibble(
sample_name = colnames(mm2_final)[2:ncol(mm2_final)],
fraction = if_else(
str_detect(sample_name, 'PA'), 'PA', 'FL'),
depth = str_replace(
sample_name, '.*(\\d{3}).*', '\\1')
) %>%
arrange(desc(fraction), depth) %>%
pull(sample_name)
# create color specifications for column annotations
anno_colors = list(
Fraction = c(
'PA' = 'darkgreen',
'FL' = 'darkseagreen2'),
Layer = c(
'Oxycline' = 'darkslategray3', #'wheat2',
'Shallow anoxic' = 'gold',# 'khaki2',
'Euxinic' = 'sienna3'),
Season = c('May' = 'slategray',
'November' = 'slategray1'),
`Depth (m)` = c(
'103' = 'grey90',
'148' = 'grey85',
'198' = 'grey80',
'200' = 'grey75',
'234' = 'grey70',
'237' = 'grey65',
'247' = 'grey60',
'267' = 'grey55',
'295' = 'grey50',
'314' = 'grey45',
'900' = 'grey20'
)
)
# final plot used in manuscript
mm2_final_mod =
mm2_final %>%
mutate(group_col = str_replace(
gene_name, '(MAG_\\d+-ctg\\d+)_.*', '\\1'), .before = 1) %>%
relocate(group_col) %>%
mutate(rowSums = rowSums(.[3:ncol(.)]), .after = 1) %>%
mutate(rs_pa = rowSums(.[str_detect(colnames(.), 'PA')]),
rs_fl = rowSums(.[str_detect(colnames(.), 'FL')])) %>%
relocate(c(rs_pa, rs_fl), .after = 1) %>%
filter(
rowSums > 19
) %>%
group_by(group_col) %>%
mutate(group_PA_rowSums = sum(rs_pa), .after = 1) %>%
mutate(group_FL_rowSums = sum(rs_fl), .after = 1) %>%
mutate(sorting_col = if_else(
group_PA_rowSums > group_FL_rowSums, 'PA', 'FL'), .after = group_PA_rowSums) %>%
ungroup() %>%
arrange(desc(group_PA_rowSums), sorting_col) %>%
select(-c(rowSums, group_PA_rowSums, group_FL_rowSums,
rs_pa, rs_fl, group_col, sorting_col)) %>%
column_to_rownames(var = 'gene_name') %>%
#relocate(all_of(pheat_col_order)) %>%
relocate(all_of(metaT_col_order))
mm2_final_mod =
mm2_final_mod %>%
mutate(pa_sums = rowSums(.[str_detect(colnames(.), 'PA')]),
fl_sums = rowSums(.[str_detect(colnames(.), 'FL')]) ) %>%
relocate(c(pa_sums, fl_sums), .before = 1) %>%
#as_tibble() %>%
mutate(row_sorter = if_else(pa_sums > fl_sums, TRUE, FALSE), .before = 1) %>%
arrange(desc(row_sorter), desc(pa_sums)) %>%
select(-c(pa_sums, fl_sums, row_sorter)) %>%
#mutate(across(everything(), ~ expm1(.x))) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}())
metaT_row_anno = tibble(
gene_name = rownames(mm2_final_mod),
mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1')
) %>%
column_to_rownames(var = 'gene_name')
# create column annotation information object
col_anno = tibble(
sample_name = colnames(mm2_final_mod),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name')
mm2_final_plot =
mm2_final_mod %>%
rownames_to_column(var = 'gene_name') %>%
mutate(mag_name = str_replace(
gene_name, '(MAG_\\d+)-.*', '\\1')) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
select(-mag_name) %>%
column_to_rownames(var = 'gene_name') %>%
as.matrix() %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
show_colnames = FALSE,
annotation_col = col_anno,
#annotation_row = metaT_row_anno,
annotation_colors = anno_colors,
gaps_col = 24,
clustering_method = 'mcquitty',
#cluster_rows = FALSE,
cluster_cols = FALSE,
show_rownames = FALSE,
border_color = 'grey60',
treeheight_row = 0,
treeheight_col = 0,
color =
c(#colorRampPalette(colors = 'white')(100),
colorRampPalette(colors = c(
'gray98',
'#aae6fa',
'#c9e6f0',
'#f7e96d',
'#f7e43b',
'goldenrod1',
'orange',
'#ff6a00',
'#D73027',
'purple4'))(1000)
),
legend_breaks = seq(0, 6, by = 2),
legend_labels = gt_render(c(
'0',
'6.39 x 10^0',
'5.36 x 10^1',
'4.02 x 10^2')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
mm2_final_plot = ComplexHeatmap::draw(
mm2_final_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right'
)
# tiff(filename = '~/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/final_main_paper_transcript_heatmap.tiff',
# width = 7,
# height = 10,
# units = 'in',
# res = 300)
# mm2_final_plot
# dev.off()
#svg(filename = '~/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/final_main_paper_transcript_heatmap.svg',
# width = 7,
# height = 10#,
# #units = 'in',
# #res = 300
#)
#mm2_final_plot
#dev.off()
## Pie chart of recovered BGCs >= 10kb in total length;
process raw results, shape data into format for pie chart creation;
create and save the pie chart
bgc_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_summary.rds')
bgc_10k_kb = bgc_summary %>%
mutate(size = end - start, .after = contig) %>%
filter(size >= 10000)
bgc_count_data = bgc_10k_kb %>%
mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1'), .before = contig) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% # remove MAGs that are from laboratory contamination
select(-mag_name) %>%
mutate(clust = paste0('clust_', 1:n()), .before = size) %>%
group_by(contig) %>%
group_split() %>%
map_dfr(~ .x %>%
mutate(ID.match = map2(
1:length(start), list(.), ~
.y %>%
filter(
start > start[.x] & start < end[.x] | # start of the BGC is greater than your start, and less than your end
start < start[.x] & end > start[.x] | # start of the BGC is less than your start, and end is greater than your start
start == start[.x] & start < end[.x]) %>% # start of the BGC is equal to your start, and less than your end
pull(clust)),
.before = start)) %>%
group_by(contig) %>%
mutate(concat = list(unlist(ID.match)), .after = ID.match) %>%
mutate(concat = map(concat, ~ unique(.x[duplicated(.x)]))) %>%
mutate(length_concat = length(concat), .after = concat) %>%
mutate(n = n(), .after = length_concat) %>%
ungroup()
all(bgc_count_data$length_concat == bgc_count_data$n) # TRUE; all contigs w/ multiple BGCs have some level of overlap amongst them; can classify these all as 'hybrid clusters' of various sorts
## [1] TRUE
bgc_count_data = bgc_count_data %>%
select(-c(ID.match, concat, length_concat)) %>%
group_by(contig) %>%
mutate(is_hybrid = if_else(n() > 1, TRUE, FALSE), .after = contig)
bgc_count_data_summary = bgc_count_data %>%
ungroup() %>%
select(-detection_rule) %>%
mutate(product = str_replace(product, '-like', ''),
product = str_replace(product, '.*PKS', 'PKS')) %>%
arrange(contig, product) %>%
group_by(contig, is_hybrid) %>%
select(-c(filename, filepath)) %>%
summarize(across(everything(), ~ paste0(.x, collapse = ';')), .groups = 'drop') %>%
mutate(product = if_else(
str_detect(product, ';'), str_replace(product, '(.*)', '\\1 hybrid'), product),
product = str_replace_all(product, ';', '-')
)
bgc_count_data_summary = bgc_count_data_summary %>%
mutate(
product = case_when(
!is_hybrid & str_detect(product, 'lanthipeptide|RRE-containing|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides') ~ 'RiPP',
!is_hybrid & str_detect(product, 'lactone') ~ 'Lactone',
!is_hybrid & str_detect(product, 'acyl_amino_acids|nucleoside|PBDE|indole|siderophore|resorcinol|ectoine|NAPAA|CDPS|LAP|redox-cofactor') ~ 'other',
!is_hybrid & str_detect(product, 'hglE-KS') ~ 'PKS',
!is_hybrid & str_detect(product, 'arylpolyene') ~ 'Aryl polyene',
!is_hybrid & str_detect(product, 'redox-cofactor') ~ 'Redox-cofactor',
#
is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-PKS hybrid',
is_hybrid & str_detect(product, 'NRPS') & !str_detect(product, 'PKS|hglE-KS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'NRPS hybrid',
is_hybrid & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|NRPS|hglE-KS') ~ 'RiPP hybrid',
is_hybrid & str_detect(product, 'PKS|hglE-KS') & !str_detect(product, 'NRPS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'PKS hybrid',
is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-RiPP hybrid',
is_hybrid & str_detect(product, 'PKS|hglE-KS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'NRPS') ~ 'PKS-RiPP hybrid',
is_hybrid & str_detect(product, 'terpene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS') ~ 'Terpene hybrid',
is_hybrid & str_detect(product, 'ladderane') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Ladderane hybrid',
is_hybrid & str_detect(product, 'arylpolyene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Aryl polyene hybrid',
is_hybrid & str_detect(product, 'lactone') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Lactone hybrid',
is_hybrid & str_detect(product, 'CDPS') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Other hybrid',
#
TRUE ~ product
)) %>%
mutate(product = if_else(str_detect(product, '^[:lower:]'), str_to_title(product), product))
bgc_count_data_summary = bgc_count_data_summary %>%
mutate(mag_name = str_replace(contig, '(MAG_\\d+)_\\d+', '\\1'), .before = contig) %>% #create mag name col
left_join(mag_table %>% select(mag_name, domain, phylum), by = 'mag_name') %>% # add phylum, domain cols
relocate(c(domain, phylum), .after = 1) %>% # relocate cols
select(c(phylum, domain, product)) %>% # select some cols
mutate(n_product_in_phylum = 1,
#sm_class = product,
product = if_else(str_detect(product, 'hybrid'), 'Hybrid cluster', product)) %>% # edit product col, add sum col, sm class
group_by(phylum, domain, product) %>%
summarize(n_product_in_phylum = sum(n_product_in_phylum), .groups = 'drop') %>% # sum up each type of bgc in each phylum
left_join(mag_table %>%
select(phylum) %>%
group_by(phylum) %>%
summarize(n_mags_in_phylum = n()),
by = 'phylum') %>% # join col that says how many MAGs are in each phylum
mutate(norm_sm_freq = n_product_in_phylum / n_mags_in_phylum, # divide #bgc / # mag per phylum for each bgc per phylum
product = if_else(product == 'Other', 'Other cluster*', product)) %>%
#
group_by(phylum) %>%
add_tally(sum(norm_sm_freq), name = 'total_sm_freq') %>%
ungroup() %>%
arrange(desc(total_sm_freq), phylum) %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))
# Pie chart of Cariaco MAG BGCs groupped by class -------------------------
final_pie =
bgc_count_data_summary %>%
mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
select(product, n_product_in_phylum) %>%
group_by(product) %>%
summarize(n_product_in_phylum = sum(n_product_in_phylum)) %>%
left_join(
tibble(
product = names(smc_palette),
color = unname(smc_palette)),
by = 'product')
library(plotly) # load plotly for plotting
##
## Attaching package: 'plotly'
## The following object is masked _by_ '.GlobalEnv':
##
## highlight
## The following object is masked from 'package:ComplexHeatmap':
##
## add_heatmap
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# create the pie chart and save it
final_bgc_pie = plot_ly(final_pie,
labels = ~product,
values = ~n_product_in_phylum,
type = 'pie',
textposition = 'inside',
textinfo = 'label+percent+value',
insidetextfont = list(color = '#FFFFFF'),
text = ~n_product_in_phylum,
marker = list(colors = ~color,
line = list(
color = '#FFFFFF',
width = 1)),
showlegend = FALSE) %>%
layout(xaxis = list(
showgrid = FALSE,
zeroline = FALSE,
showticklabels = FALSE),
yaxis = list(
showgrid = FALSE,
zeroline = FALSE,
showticklabels = FALSE))
final_bgc_pie
#save_image(final_bgc_pie,
# file = '/Users/dmcgrath/Documents/cariaco-tidy-analysis/FINAL_FIGURES_DIR/LAST_bgc_dist_pie_chart.svg',
# width = 600,
# height = 600,
# scale = 20)